if (!require("pdftools")) install.packages("pdftools"); require("pdftools")
if (!require("stringr")) install.packages("stringr"); require("stringr")
if (!require("tidyverse")) install.packages("tidyverse"); require("tidyverse")
if (!require("tokenizers")) install.packages("tokenizers"); require("tokenizers")
if (!require("pdfsearch")) install.packages("pdfsearch"); require("pdfsearch")
if (!require("haven")) install.packages("haven"); require("haven")
if (!require("janitor")) install.packages("janitor"); require("janitor")
if (!require("data.table")) install.packages("data.table"); require("data.table")

# Clean environment
# NOTE(review): this removes every object in the calling environment, so run
# the script in a fresh session rather than sourcing it into ongoing work.
rm(list = ls())
set.seed(11111001)

# ---- Define folders and relevant values ---- #
# Detect system and define main folder accordingly:
current_user <- Sys.info()[["user"]]
if (current_user == "mglpurroy") {
  main_folder <- "C:/data/igni"
} else {
  # Without this branch an unknown user only gets the cryptic
  # "object 'main_folder' not found" error a few lines further down.
  stop(
    "No main folder is configured for user '", current_user, "'. ",
    "Add a branch that defines `main_folder` for this user.",
    call. = FALSE
  )
}

# -- Input and Output folders -- #
# The booking PDFs are read from this folder and all outputs are written to it.
input_data <- file.path(main_folder, "2input_data", "saginaw_18dec2023")


# -----------------------------------------------------------------------------
##### READ IN PDF #####
# Start the timer; the elapsed time is reported at the end of the script.
time <- proc.time()

# Set working directory
# (the extraction loop below writes its intermediate RDS files relative to it)
setwd(input_data)

# Split one page of extracted PDF text into rows, and each row into fields.
#
# page_text: a single character string holding the text of one PDF page.
# Returns a list with one character vector of fields per line read by scan()
# (scan() skips blank lines by default); the list is empty when the page
# contains no lines.
split_page_rows <- function(page_text) {
  con <- textConnection(page_text)
  # Close the connection explicitly: leaving it open leaks one connection per
  # page until the garbage collector reclaims it.
  on.exit(close(con), add = TRUE)
  page_lines <- scan(con, what = "character", sep = "\n")
  # A column break is a space, one or more whitespace characters, and a space.
  strsplit(page_lines, " \\s+ ")
}

# For every yearly booking PDF: extract the page text, cache it as RDS,
# structure it into rows/columns and export the result as CSV.
for (year in 2015:2023) {
  # Extract text from pdf (one string per page)
  # TRY TO AVOID RUNNING THIS IF POSSIBLE -- Takes a while
  pdf_name <- paste0("Bookings ", year, ".pdf")
  data <- tibble(pdf.text = pdftools::pdf_text(file.path(input_data, pdf_name)))

  # Cache the raw page text next to the PDFs.
  # NOTE(review): the file name keeps the original "booking_ " prefix (with a
  # trailing space, no extension) so existing cached files keep their names.
  out_name <- paste0("booking_ ", year)
  cache_path <- file.path(input_data, out_name)
  saveRDS(data, cache_path)

  # ---------------------------------------------------------------------------
  # Read in data (explicit path, so this does not depend on the working
  # directory)
  data <- readRDS(cache_path)

  # Rename column
  data <- data %>%
    rename(text = pdf.text)

  # ---------------------------------------------------------------------------
  ##### Extract information and structure it
  # One list element per text row across all pages. Iterating over the pages
  # and rows themselves (instead of 1:n indices) means pages without any lines
  # simply contribute nothing, and no NULL gaps are left in the list.
  fields_by_page <- lapply(data$text, split_page_rows)
  row_fields <- unlist(fields_by_page, recursive = FALSE)

  # Turn every row into a one-row data frame (columns V1, V2, ...)
  row_frames <- lapply(
    row_fields,
    function(fields) as.data.frame(t(matrix(fields)))
  )

  # merging items of the list; rows with fewer fields are padded with NA
  merged_df <- dplyr::bind_rows(row_frames)

  out_file <- paste0("bookrecord_", year, "_extracted.csv")
  write.csv(merged_df, file = file.path(input_data, out_file))
}

#-------------------------------

# Elapsed time since the timer was started before the loop
proc.time() - time
